date: 2018-10-27
R setup
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(knitr)
library(readxl)
library(moments)
library(epiR)
## Warning: package 'epiR' was built under R version 3.4.4
## Loading required package: survival
## Package epiR 0.9-97 is loaded
## Type help(epi.about) for summary information
##
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:moments':
##
## kurtosis, moment, skewness
library(lsmeans)
## Warning: package 'lsmeans' was built under R version 3.4.4
## The 'lsmeans' package is being deprecated.
## Users are encouraged to switch to 'emmeans'.
## See help('transition') for more information, including how
## to convert 'lsmeans' objects and scripts to work with 'emmeans'.
library(lmSupport)
## Warning: package 'lmSupport' was built under R version 3.4.4
library(ppcor)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(tableone)
## Warning: package 'tableone' was built under R version 3.4.4
library(sjPlot)
## Warning in checkMatrixPackageVersion(): Package version inconsistency detected.
## TMB was built with Matrix version 1.2.12
## Current Matrix version is 1.2.13
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
## #refugeeswelcome
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(car)
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(leaps)
Load data
rm(list = ls())
## define directory
analysisDir <- '/Users/matthewmoll/Documents/Fellowship/MPH/Fall2018/BST213_regression/homework/'
## Read in data
cost <- read_excel("/Users/matthewmoll/Documents/Fellowship/MPH/Fall2018/BST213_regression/homework/cost.xls")
Data structure
head(cost)
## # A tibble: 6 x 14
## outcosts2 ptage ssi panic anxiety married female educat iadl social
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 83. 38.9 1.00 0. 0. 1. 1. 5. 100. 100.
## 2 2162. 53.3 1.00 0. 0. 0. 1. 2. 91.7 100.
## 3 2203. 42.7 1.08 0. 0. 0. 1. 1. 75.0 100.
## 4 0. 70.2 1.15 0. 0. 0. 1. 1. 46.7 100.
## 5 4050. 41.1 1.00 0. 0. 1. 1. 3. 100. 100.
## 6 815. 38.4 1.15 0. 0. 0. 1. 2. 83.3 100.
## # ... with 4 more variables: racecat <dbl>, whiteleycat <dbl>,
## # charlson <dbl>, depcat <dbl>
str(cost)
## Classes 'tbl_df', 'tbl' and 'data.frame': 461 obs. of 14 variables:
## $ outcosts2 : num 83 2162 2203 0 4050 ...
## $ ptage : num 38.9 53.3 42.7 70.2 41.1 ...
## $ ssi : num 1 1 1.08 1.15 1 ...
## $ panic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anxiety : num 0 0 0 0 0 0 0 0 0 0 ...
## $ married : num 1 0 0 0 1 0 1 0 0 0 ...
## $ female : num 1 1 1 1 1 1 0 1 0 0 ...
## $ educat : num 5 2 1 1 3 2 5 3 2 2 ...
## $ iadl : num 100 91.7 75 46.7 100 ...
## $ social : num 100 100 100 100 100 ...
## $ racecat : num 4 2 2 1 1 2 4 2 1 1 ...
## $ whiteleycat: num 1 1 1 1 1 1 1 1 1 1 ...
## $ charlson : num 0 1 1 2 0 3 0 2 0 1 ...
## $ depcat : num 3 3 3 3 3 3 3 3 3 3 ...
sapply(cost, summary)
## $outcosts2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 307 1016 1836 2463 11909
##
## $ptage
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.96 32.24 44.62 45.16 55.21 89.68 1
##
## $ssi
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.231 1.538 1.738 2.077 5.000 6
##
## $panic
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06291 0.00000 1.00000
##
## $anxiety
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06508 0.00000 1.00000
##
## $married
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.4096 1.0000 1.0000 2
##
## $female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 0.744 1.000 1.000
##
## $educat
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 2.000 3.000 3.359 5.000 5.000 1
##
## $iadl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 72.22 94.44 81.82 100.00 100.00
##
## $social
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 100.00 100.00 89.86 100.00 100.00 2
##
## $racecat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.297 4.000 4.000
##
## $whiteleycat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.456 2.000 3.000
##
## $charlson
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5662 1.0000 7.0000
##
## $depcat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 3.00 2.72 3.00 3.00
Define variables
The goal for this assignment is to build a linear regression model using the priniciples discussed in class, and then write it into a methods and results section of a manuscript.
removevars <- c()
## Define response variables
responseVars <- c('outcosts2')
responseVars
## [1] "outcosts2"
## Define explanatory variables
explanVars <- names(cost)[!names(cost) %in% c("outcosts2")]
explanVars
## [1] "ptage" "ssi" "panic" "anxiety" "married"
## [6] "female" "educat" "iadl" "social" "racecat"
## [11] "whiteleycat" "charlson" "depcat"
## List of quantitative variables
quantVars <- names(cost[!names(cost) %in% removevars &
sapply(cost, function(x)
length(levels(as.factor(x)))>7)])
quantVars
## [1] "outcosts2" "ptage" "ssi" "iadl" "social" "charlson"
## List of binary variables
binVars <- names(cost[!names(cost) %in% removevars &
sapply(cost, function(x)
length(levels(as.factor(x)))==2)])
binVars
## [1] "panic" "anxiety" "married" "female"
## List of categorical variables
catVars <- names(cost[!names(cost) %in% c(removevars,binVars) &
sapply(cost, function(x)
length(levels(as.factor(x)))<=7 & length(levels(as.factor(x)))>1)])
catVars
## [1] "educat" "racecat" "whiteleycat" "depcat"
## Get Intersection between quantVars and explanVars and responseVars
explanVars.quant <- intersect(explanVars,quantVars)
responseVars.quant <- intersect(responseVars,quantVars)
## Get intersection between categorical and binary variables with explanatory and response vars
explanVars.cat <- intersect(explanVars,catVars)
explanVars.cat <- explanVars.cat[!explanVars.cat %in% c(binVars)]
responseVars.cat <- intersect(responseVars, catVars)
responseVars.cat <- responseVars.cat[!responseVars.cat %in% c(binVars)]
## Get binary explanatory and response variables
explanVars.bin <- intersect(explanVars, binVars)
responseVars.bin <- intersect(responseVars, binVars)
## View data ranges
str(cost)
## Classes 'tbl_df', 'tbl' and 'data.frame': 461 obs. of 14 variables:
## $ outcosts2 : num 83 2162 2203 0 4050 ...
## $ ptage : num 38.9 53.3 42.7 70.2 41.1 ...
## $ ssi : num 1 1 1.08 1.15 1 ...
## $ panic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anxiety : num 0 0 0 0 0 0 0 0 0 0 ...
## $ married : num 1 0 0 0 1 0 1 0 0 0 ...
## $ female : num 1 1 1 1 1 1 0 1 0 0 ...
## $ educat : num 5 2 1 1 3 2 5 3 2 2 ...
## $ iadl : num 100 91.7 75 46.7 100 ...
## $ social : num 100 100 100 100 100 ...
## $ racecat : num 4 2 2 1 1 2 4 2 1 1 ...
## $ whiteleycat: num 1 1 1 1 1 1 1 1 1 1 ...
## $ charlson : num 0 1 1 2 0 3 0 2 0 1 ...
## $ depcat : num 3 3 3 3 3 3 3 3 3 3 ...
sapply(cost, summary)
## $outcosts2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 307 1016 1836 2463 11909
##
## $ptage
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.96 32.24 44.62 45.16 55.21 89.68 1
##
## $ssi
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.231 1.538 1.738 2.077 5.000 6
##
## $panic
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06291 0.00000 1.00000
##
## $anxiety
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.06508 0.00000 1.00000
##
## $married
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.4096 1.0000 1.0000 2
##
## $female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 0.744 1.000 1.000
##
## $educat
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 2.000 3.000 3.359 5.000 5.000 1
##
## $iadl
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 72.22 94.44 81.82 100.00 100.00
##
## $social
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 100.00 100.00 89.86 100.00 100.00 2
##
## $racecat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.297 4.000 4.000
##
## $whiteleycat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 1.456 2.000 3.000
##
## $charlson
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5662 1.0000 7.0000
##
## $depcat
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 3.00 2.72 3.00 3.00
Normality assessment
cost_quant <- as.data.frame(cost %>% dplyr::select(quantVars))
normalityfun <- function(dataset) {
require(moments)
require(e1071)
print(paste0("Summary Statistics: ", names(dataset)[i]))
print(paste0("Mean: ", mean(as.numeric(dataset[,i]), na.rm = T)))
print(paste0("Standard Deviation: ",sd(as.numeric(dataset[,i]), na.rm = T)))
print(paste0("Median: ", median(as.numeric(dataset[,i]), na.rm = T)))
print(paste0("Skewness: ", skewness(as.numeric(dataset[,i]), na.rm = T)))
print(paste0("Kurtosis: ", e1071::kurtosis(as.numeric(dataset[,i]), type = 2)))
print(paste0("Shapiro-Wilk Test: ", shapiro.test(as.numeric(dataset[,i]))$p.value))
print(" ")
print(" ")
qqnorm(as.numeric(dataset[,i]), main = paste0("Normal Q-Q plot: ",names(dataset)[i]))
qqline(as.numeric(dataset[,i]), col = 2)
}
for(i in 1:length(quantVars)) {
print(ggplot() +
geom_histogram(data = cost_quant, fill = "blue", alpha = 0.5, mapping =
aes(x = get(quantVars[i]))) +
xlab(quantVars[i]) + labs(title = "Histogram")
)
normalityfun(cost_quant)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: outcosts2"
## [1] "Mean: 1836.48373101952"
## [1] "Standard Deviation: 2195.4502208015"
## [1] "Median: 1016"
## [1] "Skewness: 1.86911992818483"
## [1] "Kurtosis: 3.71673113110437"
## [1] "Shapiro-Wilk Test: 1.32694421506979e-24"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: ptage"
## [1] "Mean: 45.1636818141237"
## [1] "Standard Deviation: 15.4858207820523"
## [1] "Median: 44.621492128679"
## [1] "Skewness: 0.422861613908007"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 9.58087935521775e-08"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: ssi"
## [1] "Mean: 1.73845513460898"
## [1] "Standard Deviation: 0.693705411185324"
## [1] "Median: 1.53846153846154"
## [1] "Skewness: 1.35710574430674"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 1.83131200837982e-19"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: iadl"
## [1] "Mean: 81.8185104844541"
## [1] "Standard Deviation: 25.5994105151121"
## [1] "Median: 94.4444444444444"
## [1] "Skewness: -1.54182530496445"
## [1] "Kurtosis: 1.54732926242677"
## [1] "Shapiro-Wilk Test: 3.82337675082766e-26"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (stat_bin).

## [1] "Summary Statistics: social"
## [1] "Mean: 89.8571774388768"
## [1] "Standard Deviation: 22.5952089828678"
## [1] "Median: 100"
## [1] "Skewness: -2.50691098205916"
## [1] "Kurtosis: NA"
## [1] "Shapiro-Wilk Test: 1.85402296574893e-33"
## [1] " "
## [1] " "

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Summary Statistics: charlson"
## [1] "Mean: 0.566160520607375"
## [1] "Standard Deviation: 1.07464059120437"
## [1] "Median: 0"
## [1] "Skewness: 2.57120849300461"
## [1] "Kurtosis: 8.03575657287451"
## [1] "Shapiro-Wilk Test: 1.66716672242903e-31"
## [1] " "
## [1] " "

The only variable that looks close to normal is patient age, which we may be able to use parametric testing by invoking the central limit theorem. Otherwise, we will report median and interquartile ranges.
Missingness
## assess for and remove > 10% missingness
## Set threshold
thr <- 0.1*length(cost$outcosts2)
sapply(cost, function(x) ifelse(sum(is.na(x)) > thr, 1, 0))
## outcosts2 ptage ssi panic anxiety married
## 0 0 0 0 0 0
## female educat iadl social racecat whiteleycat
## 0 0 0 0 0 0
## charlson depcat
## 0 0
sapply(cost,function(x) sum(is.na(x)))
## outcosts2 ptage ssi panic anxiety married
## 0 1 6 0 0 2
## female educat iadl social racecat whiteleycat
## 0 1 0 2 0 0
## charlson depcat
## 0 0
## Create complete dataset of phenotypic variables
cost <- na.omit(cost)
No variables were removed for missingness. However, ssi has the greatest missingness.
Table 1: Demographics
## Add labels to table
labelled::var_label(cost$outcosts2) <- "Outpatient costs ($)"
labelled::var_label(cost$ptage) <- "Age in years"
labelled::var_label(cost$ssi) <- "Somatic Symptom Index (No. (%))"
labelled::var_label(cost$panic) <- "Panic (No. (%))"
labelled::var_label(cost$anxiety) <- "Anxiety (No. (%))"
labelled::var_label(cost$married) <- "Married (No. (%))"
labelled::var_label(cost$female) <- "Female (No. (%))"
labelled::var_label(cost$educat) <- "Education Level (No. (%))"
labelled::var_label(cost$iadl) <- "IADL score"
labelled::var_label(cost$social) <- "Social Activities Score"
labelled::var_label(cost$racecat) <- "Race (No. (%))"
labelled::var_label(cost$whiteleycat) <- "Whiteley Category"
labelled::var_label(cost$charlson) <- "Charlson index"
labelled::var_label(cost$depcat) <- "Depression Category (No. (%))"
## Group variables
demovars <- names(cost)
demovars
## [1] "outcosts2" "ptage" "ssi" "panic" "anxiety"
## [6] "married" "female" "educat" "iadl" "social"
## [11] "racecat" "whiteleycat" "charlson" "depcat"
varsToFactor <- c(catVars, binVars)
nonNormalVars <- names(cost_quant)[!names(cost_quant) %in% c("ptage")]
## Table one
tableOne <- CreateTableOne(vars = demovars, data = cost, factorVars = varsToFactor, testNonNormal = nonNormalVars)
tableOne
##
## Overall
## n 449
## outcosts2 (mean (sd)) 1829.86 (2205.83)
## ptage (mean (sd)) 45.22 (15.55)
## ssi (mean (sd)) 1.74 (0.69)
## panic = 1 (%) 28 ( 6.2)
## anxiety = 1 (%) 28 ( 6.2)
## married = 1 (%) 184 (41.0)
## female = 1 (%) 332 (73.9)
## educat (%)
## 1 39 ( 8.7)
## 2 100 (22.3)
## 3 91 (20.3)
## 4 94 (20.9)
## 5 125 (27.8)
## iadl (mean (sd)) 81.80 (25.72)
## social (mean (sd)) 89.73 (22.78)
## racecat (%)
## 1 174 (38.8)
## 2 97 (21.6)
## 3 48 (10.7)
## 4 130 (29.0)
## whiteleycat (%)
## 1 297 (66.1)
## 2 100 (22.3)
## 3 52 (11.6)
## charlson (mean (sd)) 0.57 (1.07)
## depcat (%)
## 1 45 (10.0)
## 2 36 ( 8.0)
## 3 368 (82.0)
## Save as csv and word document
tableOne_out <- print(tableOne,quote = FALSE, noSpaces = TRUE, printToggle = FALSE, varLabel = TRUE, nonnormal = nonNormalVars)
write.csv(tableOne_out, file = paste0(analysisDir,"TableOne.csv"))
tableOne_out <- fread(paste0(analysisDir, "TableOne.csv"))
colnames(tableOne_out) <- c("","")
tab_df(tableOne_out, file = paste0(analysisDir, "TableOne.doc"))
|
|
|
|
n
|
449
|
|
Outpatient costs ($) (median [IQR])
|
998.00 [305.00, 2463.00]
|
|
Age in years (mean (sd))
|
45.22 (15.55)
|
|
Somatic Symptom Index (No. (%)) (median [IQR])
|
1.54 [1.23, 2.08]
|
|
Panic (No. (%)) = 1 (%)
|
28 (6.2)
|
|
Anxiety (No. (%)) = 1 (%)
|
28 (6.2)
|
|
Married (No. (%)) = 1 (%)
|
184 (41.0)
|
|
Female (No. (%)) = 1 (%)
|
332 (73.9)
|
|
Education Level (No. (%)) (%)
|
|
|
1
|
39 (8.7)
|
|
2
|
100 (22.3)
|
|
3
|
91 (20.3)
|
|
4
|
94 (20.9)
|
|
5
|
125 (27.8)
|
|
IADL score (median [IQR])
|
94.44 [72.22, 100.00]
|
|
Social Activities Score (median [IQR])
|
100.00 [100.00, 100.00]
|
|
Race (No. (%)) (%)
|
|
|
1
|
174 (38.8)
|
|
2
|
97 (21.6)
|
|
3
|
48 (10.7)
|
|
4
|
130 (29.0)
|
|
Whiteley Category (%)
|
|
|
1
|
297 (66.1)
|
|
2
|
100 (22.3)
|
|
3
|
52 (11.6)
|
|
Charlson index (median [IQR])
|
0.00 [0.00, 1.00]
|
|
Depression Category (No. (%)) (%)
|
|
|
1
|
45 (10.0)
|
|
2
|
36 (8.0)
|
|
3
|
368 (82.0)
|
Scatterplots and correlation coefficients
## Scatterplots and correlation coefficients
for (i in 1:length(explanVars.quant)) {
for (j in 1:length(responseVars.quant)) {
print(paste0("Explanatory var: ", explanVars.quant[i]))
print(paste0("Response var: ", responseVars.quant[j]))
cormod <- cor.test(as.data.frame(cost_quant)[,explanVars.quant[i]],
as.data.frame(cost_quant)[,responseVars.quant[j]])
print(paste0("Pearson Correlation Coefficient: ", signif(cormod$estimate,5)))
print(paste0("p-value: ", signif(cormod$p.value,5)))
print(" ")
print(" ")
print(ggplot(data = cost_quant) +
geom_point(mapping = aes(x = get(explanVars.quant[i]),
y = get(responseVars.quant[j]))) +
geom_smooth(mapping = aes(x = get(explanVars.quant[i]),
y = get(responseVars.quant[j])), method = 'lm') +
xlab(explanVars.quant[i]) + ylab(responseVars.quant[j]) +
labs(subtitle =
paste0(paste0("r-value: ", signif(cormod$estimate,5)),", ",
paste0("p-value: ", signif(cormod$p.value,5)))))
## plot residuals
print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost_quant)) +
geom_point(aes(x = .fitted, y = .resid)) +
labs(title = paste0("Residuals vs fitted: ",explanVars.quant[i]))) # fitted vs residuals
print(ggplot(data = lm(get(responseVars.quant[j])~get(explanVars.quant[i]), data = cost_quant)) +
geom_histogram(aes(x = .resid)) +
labs(title = paste0("Residuals histogram: ",explanVars.quant[i])))
}
}
## [1] "Explanatory var: ptage"
## [1] "Response var: outcosts2"
## [1] "Pearson Correlation Coefficient: 0.10011"
## [1] "p-value: 0.031819"
## [1] " "
## [1] " "
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Explanatory var: ssi"
## [1] "Response var: outcosts2"
## [1] "Pearson Correlation Coefficient: 0.24697"
## [1] "p-value: 9.4751e-08"
## [1] " "
## [1] " "
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Explanatory var: iadl"
## [1] "Response var: outcosts2"
## [1] "Pearson Correlation Coefficient: -0.28106"
## [1] "p-value: 8.1216e-10"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Explanatory var: social"
## [1] "Response var: outcosts2"
## [1] "Pearson Correlation Coefficient: -0.19484"
## [1] "p-value: 2.632e-05"
## [1] " "
## [1] " "
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## [1] "Explanatory var: charlson"
## [1] "Response var: outcosts2"
## [1] "Pearson Correlation Coefficient: 0.33573"
## [1] "p-value: 1.3139e-13"
## [1] " "
## [1] " "


## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Variable selection
#
# FS <- regsubsets(as.formula(LinEq), data = cost, method = "forward", force.in = c("depcat","charlson"))
#
# str(summary(FS))
# str(FS)
#
# summary(FS)$rss
null <- lm(outcosts2~1, data = cost)
full <- lm(outcosts2~., data = cost)
FS <- step(null, scope=list(lower=null, upper=full), direction="forward")
## Start: AIC=6914.57
## outcosts2 ~ 1
##
## Df Sum of Sq RSS AIC
## + charlson 1 240985083 1938832720 6864.0
## + iadl 1 163770786 2016047017 6881.5
## + ssi 1 133484030 2046333773 6888.2
## + racecat 1 124486863 2055330940 6890.2
## + depcat 1 96034963 2083782841 6896.3
## + educat 1 92820171 2086997633 6897.0
## + social 1 81518772 2098299031 6899.5
## + whiteleycat 1 80410942 2099406861 6899.7
## + panic 1 34614794 2145203009 6909.4
## + anxiety 1 24581357 2155236447 6911.5
## + ptage 1 20088860 2159728943 6912.4
## + female 1 13917217 2165900586 6913.7
## + married 1 13095811 2166721993 6913.9
## <none> 2179817803 6914.6
##
## Step: AIC=6863.97
## outcosts2 ~ charlson
##
## Df Sum of Sq RSS AIC
## + ssi 1 85455902 1853376818 6845.7
## + iadl 1 82639449 1856193271 6846.4
## + racecat 1 76178757 1862653963 6848.0
## + depcat 1 62334119 1876498601 6851.3
## + whiteleycat 1 52109860 1886722860 6853.7
## + educat 1 49609668 1889223052 6854.3
## + social 1 45657600 1893175121 6855.3
## + anxiety 1 23073440 1915759280 6860.6
## + panic 1 20666597 1918166124 6861.2
## + female 1 19857339 1918975382 6861.3
## + married 1 18252581 1920580139 6861.7
## <none> 1938832720 6864.0
## + ptage 1 185287 1938647434 6865.9
##
## Step: AIC=6845.73
## outcosts2 ~ charlson + ssi
##
## Df Sum of Sq RSS AIC
## + racecat 1 60285654 1793091165 6832.9
## + iadl 1 21990226 1831386592 6842.4
## + educat 1 20888173 1832488646 6842.6
## + depcat 1 20482683 1832894136 6842.7
## + female 1 13988501 1839388318 6844.3
## + married 1 9239284 1844137534 6845.5
## <none> 1853376818 6845.7
## + whiteleycat 1 7309692 1846067127 6846.0
## + social 1 5917369 1847459450 6846.3
## + anxiety 1 4488975 1848887843 6846.6
## + panic 1 2688754 1850688064 6847.1
## + ptage 1 1212095 1852164723 6847.4
##
## Step: AIC=6832.88
## outcosts2 ~ charlson + ssi + racecat
##
## Df Sum of Sq RSS AIC
## + depcat 1 18609227 1774481937 6830.2
## + educat 1 14815977 1778275188 6831.2
## + iadl 1 14741752 1778349413 6831.2
## + female 1 13779113 1779312051 6831.4
## + married 1 12542302 1780548863 6831.7
## <none> 1793091165 6832.9
## + whiteleycat 1 6267831 1786823334 6833.3
## + ptage 1 6075168 1787015997 6833.4
## + social 1 5680601 1787410564 6833.5
## + anxiety 1 4302180 1788788984 6833.8
## + panic 1 3881596 1789209569 6833.9
##
## Step: AIC=6830.2
## outcosts2 ~ charlson + ssi + racecat + depcat
##
## Df Sum of Sq RSS AIC
## + iadl 1 12767888 1761714049 6829.0
## + female 1 10797743 1763684194 6829.5
## + married 1 10520394 1763961543 6829.5
## + educat 1 9655606 1764826331 6829.7
## <none> 1774481937 6830.2
## + ptage 1 3827942 1770653995 6831.2
## + social 1 2152371 1772329566 6831.7
## + whiteleycat 1 1062997 1773418940 6831.9
## + panic 1 746116 1773735821 6832.0
## + anxiety 1 375237 1774106700 6832.1
##
## Step: AIC=6828.96
## outcosts2 ~ charlson + ssi + racecat + depcat + iadl
##
## Df Sum of Sq RSS AIC
## + female 1 9855027 1751859022 6828.4
## + married 1 8715904 1752998145 6828.7
## <none> 1761714049 6829.0
## + ptage 1 6217321 1755496729 6829.4
## + educat 1 6146735 1755567315 6829.4
## + social 1 1390391 1760323659 6830.6
## + whiteleycat 1 895128 1760818921 6830.7
## + panic 1 509845 1761204204 6830.8
## + anxiety 1 503253 1761210796 6830.8
##
## Step: AIC=6828.44
## outcosts2 ~ charlson + ssi + racecat + depcat + iadl + female
##
## Df Sum of Sq RSS AIC
## <none> 1751859022 6828.4
## + educat 1 5304184 1746554838 6829.1
## + married 1 5231307 1746627714 6829.1
## + ptage 1 4679154 1747179868 6829.2
## + social 1 1411315 1750447707 6830.1
## + anxiety 1 788709 1751070313 6830.2
## + whiteleycat 1 552418 1751306604 6830.3
## + panic 1 410648 1751448374 6830.3
## View best model
summary(FS)
##
## Call:
## lm(formula = outcosts2 ~ charlson + ssi + racecat + depcat +
## iadl + female, data = cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4806.9 -1158.9 -529.6 669.9 8624.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2917.841 838.943 3.478 0.000555 ***
## charlson 534.604 91.734 5.828 1.08e-08 ***
## ssi 285.302 175.069 1.630 0.103888
## racecat -279.386 77.138 -3.622 0.000326 ***
## depcat -309.718 164.689 -1.881 0.060680 .
## iadl -7.997 4.630 -1.727 0.084814 .
## female 341.034 216.276 1.577 0.115545
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1991 on 442 degrees of freedom
## Multiple R-squared: 0.1963, Adjusted R-squared: 0.1854
## F-statistic: 18 on 6 and 442 DF, p-value: < 2.2e-16
## Create vector of selected variables
selectedVars <- rownames(summary(FS)$coefficients)
selectedVars <- selectedVars[!selectedVars %in% c("(Intercept)")]
selectedVars
## [1] "charlson" "ssi" "racecat" "depcat" "iadl" "female"
summary(FS)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2917.840536 838.942636 3.477998 5.552137e-04
## charlson 534.604127 91.733723 5.827782 1.083905e-08
## ssi 285.302090 175.069404 1.629651 1.038876e-01
## racecat -279.386184 77.137852 -3.621908 3.263541e-04
## depcat -309.717995 164.688760 -1.880626 6.068003e-02
## iadl -7.996864 4.629716 -1.727290 8.481415e-02
## female 341.034392 216.275689 1.576850 1.155454e-01
Multicollinearity: variance inflation factors
LinEq <- paste0("outcosts2~",paste0(selectedVars, collapse = "+"))
LinEq
## [1] "outcosts2~charlson+ssi+racecat+depcat+iadl+female"
LinMod <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
##
## Call:
## lm(formula = as.formula(LinEq), data = cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4806.9 -1158.9 -529.6 669.9 8624.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2917.841 838.943 3.478 0.000555 ***
## charlson 534.604 91.734 5.828 1.08e-08 ***
## ssi 285.302 175.069 1.630 0.103888
## racecat -279.386 77.138 -3.622 0.000326 ***
## depcat -309.718 164.689 -1.881 0.060680 .
## iadl -7.997 4.630 -1.727 0.084814 .
## female 341.034 216.276 1.577 0.115545
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1991 on 442 degrees of freedom
## Multiple R-squared: 0.1963, Adjusted R-squared: 0.1854
## F-statistic: 18 on 6 and 442 DF, p-value: < 2.2e-16
vif(LinMod)
## charlson ssi racecat depcat iadl female
## 1.096209 1.670461 1.053903 1.236142 1.603067 1.020977
## Remove if VIF > 10 - none in this case
## Now create table for adjusted coefficients
coefTable <-
data.frame(
variable =
rownames(summary(LinMod)$coefficients)[2:length(summary(LinMod)$coefficients[,1])],
coefficient =
as.numeric(summary(LinMod)$coefficients[2:length(summary(LinMod)$coefficients[,1]),1]),
LCI =
as.numeric(confint(LinMod)[2:length(summary(LinMod)$coefficients[,1]),1]),
UCI = as.numeric(confint(LinMod)[2:length(summary(LinMod)$coefficients[,1]),2]),
pvalue =
signif(as.numeric(summary(LinMod)$coefficients[2:length(summary(LinMod)$coefficients[,1]),4]),9))
kable(coefTable)
| charlson |
534.604127 |
354.31566 |
714.892595 |
0.0000000 |
| ssi |
285.302090 |
-58.76979 |
629.373972 |
0.1038876 |
| racecat |
-279.386184 |
-430.98872 |
-127.783646 |
0.0003264 |
| depcat |
-309.717995 |
-633.38832 |
13.952333 |
0.0606800 |
| iadl |
-7.996864 |
-17.09586 |
1.102129 |
0.0848141 |
| female |
341.034392 |
-84.02208 |
766.090863 |
0.1155454 |
Univariate OLS
coefTable.uni <- data.frame(variable = character(),coefficient = numeric(), LCI = numeric(), UCI = numeric(), pvalue = numeric())
for(i in 1:length(selectedVars)) {
LinEq.uni <- paste0("outcosts2~",selectedVars[i])
LinEq.uni
LinMod.uni <- lm(as.formula(LinEq.uni), data = cost)
coefTable.uni <- rbind(coefTable.uni, data.frame(variable = selectedVars[i],
coefficient = as.numeric(summary(LinMod.uni)$coefficients[2,1]),
LCI = confint(LinMod.uni)[2,1],
UCI = confint(LinMod.uni)[2,2],
pvalue = signif(summary(LinMod.uni)$coefficients[2,4], 9)
))
}
kable(coefTable.uni)
| charlson |
683.18615 |
503.05618 |
863.31612 |
0.0000000 |
| ssi |
786.08278 |
499.98553 |
1072.18004 |
0.0000001 |
| racecat |
-421.10505 |
-580.15788 |
-262.05222 |
0.0000003 |
| depcat |
-729.13175 |
-1044.84251 |
-413.42099 |
0.0000073 |
| iadl |
-23.50489 |
-31.17078 |
-15.83901 |
0.0000000 |
| female |
401.08591 |
-64.02046 |
866.19228 |
0.0908156 |
Table 2: coefficients
table2 <- bind_cols(coefTable.uni,coefTable)
table2 <- as.data.frame(sapply(table2 %>% dplyr::select(-variable,-variable1), function(x) signif(x, 5)))
kable(table2)
| 683.190 |
503.060 |
863.320 |
0.0000000 |
534.6000 |
354.320 |
714.8900 |
0.0000000 |
| 786.080 |
499.990 |
1072.200 |
0.0000001 |
285.3000 |
-58.770 |
629.3700 |
0.1038900 |
| -421.110 |
-580.160 |
-262.050 |
0.0000003 |
-279.3900 |
-430.990 |
-127.7800 |
0.0003264 |
| -729.130 |
-1044.800 |
-413.420 |
0.0000073 |
-309.7200 |
-633.390 |
13.9520 |
0.0606800 |
| -23.505 |
-31.171 |
-15.839 |
0.0000000 |
-7.9969 |
-17.096 |
1.1021 |
0.0848140 |
| 401.090 |
-64.020 |
866.190 |
0.0908160 |
341.0300 |
-84.022 |
766.0900 |
0.1155500 |
## if p < 0.0001
table2[,c("pvalue","pvalue1")] <- sapply(table2[,c("pvalue","pvalue1")], function(x) ifelse(x < 0.0001, "< 0.0001",x))
## Create vector rownames and extract labels
variableNames <- selectedVars
variableNameLabels <- vector(length = length(variableNames))
for(y in 1:length(variableNameLabels)) {
variableNameLabels[y] <- labelled::var_label(cost[,variableNames[y]])
}
## Collapse 95% CIs into cell with hazard ratios
table2 <- table2 %>% mutate(variable = variableNameLabels,
Coef1 = paste0(coefficient," (",LCI,",",UCI,")"),
Coef2 = paste0(coefficient1," (",LCI1,",",UCI1,")")) %>% dplyr::select(variable,Coef1,pvalue,Coef2,pvalue1)
## Warning: package 'bindrcpp' was built under R version 3.4.4
colnames(table2) <- c("Variable", "Unadjusted Coefficient (95% CI)","p-value","Adjusted Coefficient (95% CI)","p-value")
kable(table2)
| Charlson index |
683.19 (503.06,863.32) |
< 0.0001 |
534.6 (354.32,714.89) |
< 0.0001 |
| Somatic Symptom Index (No. (%)) |
786.08 (499.99,1072.2) |
< 0.0001 |
285.3 (-58.77,629.37) |
0.10389 |
| Race (No. (%)) |
-421.11 (-580.16,-262.05) |
< 0.0001 |
-279.39 (-430.99,-127.78) |
0.00032635 |
| Depression Category (No. (%)) |
-729.13 (-1044.8,-413.42) |
< 0.0001 |
-309.72 (-633.39,13.952) |
0.06068 |
| IADL score |
-23.505 (-31.171,-15.839) |
< 0.0001 |
-7.9969 (-17.096,1.1021) |
0.084814 |
| Female (No. (%)) |
401.09 (-64.02,866.19) |
0.090816 |
341.03 (-84.022,766.09) |
0.11555 |
tab_df(table2, file = paste0(analysisDir,"Table2.doc"))
|
Variable
|
Unadjusted Coefficient (95% CI)
|
p-value
|
Adjusted Coefficient (95% CI)
|
p-value
|
|
Charlson index
|
683.19 (503.06,863.32)
|
< 0.0001
|
534.6 (354.32,714.89)
|
< 0.0001
|
|
Somatic Symptom Index (No. (%))
|
786.08 (499.99,1072.2)
|
< 0.0001
|
285.3 (-58.77,629.37)
|
0.10389
|
|
Race (No. (%))
|
-421.11 (-580.16,-262.05)
|
< 0.0001
|
-279.39 (-430.99,-127.78)
|
0.00032635
|
|
Depression Category (No. (%))
|
-729.13 (-1044.8,-413.42)
|
< 0.0001
|
-309.72 (-633.39,13.952)
|
0.06068
|
|
IADL score
|
-23.505 (-31.171,-15.839)
|
< 0.0001
|
-7.9969 (-17.096,1.1021)
|
0.084814
|
|
Female (No. (%))
|
401.09 (-64.02,866.19)
|
0.090816
|
341.03 (-84.022,766.09)
|
0.11555
|
Effect modification
## Evaluate for effect modification by adding an interaction term
cost <- cost %>% mutate(chdep = charlson*depcat)
labelled::var_label(cost$chdep) <- 'Charlson*Depression Category Interaction Term' # label this new term
## Retrain the model with the new interaction term
LinEq <- paste0("outcosts2~",paste0(c(selectedVars,"chdep"), collapse = "+"))
LinEq
## [1] "outcosts2~charlson+ssi+racecat+depcat+iadl+female+chdep"
LinMod <- lm(as.formula(LinEq), data = cost)
summary(LinMod)
##
## Call:
## lm(formula = as.formula(LinEq), data = cost)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5443.0 -1169.9 -518.4 664.3 8610.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2755.335 863.067 3.192 0.00151 **
## charlson 783.175 321.234 2.438 0.01516 *
## ssi 278.822 175.322 1.590 0.11247
## racecat -284.280 77.406 -3.673 0.00027 ***
## depcat -235.211 188.834 -1.246 0.21358
## iadl -8.195 4.638 -1.767 0.07793 .
## female 340.123 216.364 1.572 0.11667
## chdep -97.323 120.532 -0.807 0.41984
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1992 on 441 degrees of freedom
## Multiple R-squared: 0.1975, Adjusted R-squared: 0.1848
## F-statistic: 15.51 on 7 and 441 DF, p-value: < 2.2e-16
vif(LinMod)
## charlson ssi racecat depcat iadl female chdep
## 13.431841 1.673968 1.060404 1.623903 1.607570 1.021005 13.328804
ggplot(data = cost[,c("outcosts2",selectedVars)],
aes(x = charlson, y = outcosts2,group = as.factor(depcat), color = as.factor(depcat))) +
geom_smooth(method = 'lm') +
ggtitle(" ") + xlab("Charlson Index") + ylab("Outpatient Costs") +
labs(color = "Depression \nCategory\n") +
scale_color_manual(labels = c("None","Minor","Major"), values = c("red","blue","dark green"))

## Save file
pdf(paste0(analysisDir,"Figure1.pdf"))
ggplot(data = cost[,c("outcosts2",selectedVars)],
aes(x = charlson, y = outcosts2,group = as.factor(depcat), color = as.factor(depcat))) +
geom_smooth(method = 'lm') +
ggtitle(" ") + xlab("Charlson Index") + ylab("Outpatient Costs") +
labs(color = "Depression \nCategory\n") +
scale_color_manual(labels = c("None","Minor","Major"), values = c("red","blue","dark green"))
dev.off()
## quartz_off_screen
## 2
Session Info
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 leaps_3.0 car_3.0-0
## [4] carData_3.0-1 stargazer_5.2.1 sjPlot_2.4.1
## [7] tableone_0.9.3 ppcor_1.1 MASS_7.3-49
## [10] lmSupport_2.9.13 lsmeans_2.27-62 e1071_1.6-8
## [13] epiR_0.9-97 survival_2.41-3 moments_0.14
## [16] readxl_1.0.0 knitr_1.20 data.table_1.10.4-3
## [19] forcats_0.3.0 stringr_1.3.0 dplyr_0.7.4
## [22] purrr_0.2.4 readr_1.1.1 tidyr_0.8.0
## [25] tibble_1.4.2 ggplot2_2.2.1 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.2 blme_1.0-4 VGAM_1.0-6
## [4] plyr_1.8.4 lazyeval_0.2.1 sp_1.2-7
## [7] TMB_1.7.13 splines_3.4.3 unmarked_0.12-2
## [10] TH.data_1.0-8 digest_0.6.15 htmltools_0.3.6
## [13] gdata_2.18.0 magrittr_1.5 AICcmodavg_2.1-1
## [16] openxlsx_4.0.17 modelr_0.1.1 sandwich_2.4-0
## [19] colorspace_1.3-2 rvest_0.3.2 BiasedUrn_1.07
## [22] haven_1.1.1 crayon_1.3.4 jsonlite_1.5
## [25] lme4_1.1-17 bindr_0.1.1 zoo_1.8-1
## [28] glue_1.2.0 gtable_0.2.0 emmeans_1.1.3
## [31] sjstats_0.14.2-3 sjmisc_2.7.1 abind_1.4-5
## [34] scales_0.5.0 mvtnorm_1.0-7 ggeffects_0.3.2
## [37] Rcpp_0.12.16 xtable_1.8-2 merTools_0.3.0
## [40] foreign_0.8-69 stats4_3.4.3 prediction_0.2.0
## [43] survey_3.33-2 DT_0.4 htmlwidgets_1.0
## [46] httr_1.3.1 gplots_3.0.1 modeltools_0.2-21
## [49] pkgconfig_2.0.1 reshape_0.8.7 nnet_7.3-12
## [52] utf8_1.1.3 tidyselect_0.2.4 labeling_0.3
## [55] rlang_0.2.0 reshape2_1.4.3 later_0.7.4
## [58] munsell_0.4.3 cellranger_1.1.0 tools_3.4.3
## [61] cli_1.0.0 sjlabelled_1.0.8 broom_0.4.4
## [64] ggridges_0.5.0 evaluate_0.10.1 arm_1.9-3
## [67] yaml_2.1.18 caTools_1.17.1 coin_1.2-2
## [70] nlme_3.1-137 mime_0.5 xml2_1.2.0
## [73] compiler_3.4.3 pbkrtest_0.4-7 bayesplot_1.5.0
## [76] rstudioapi_0.7 curl_3.2 stringi_1.1.7
## [79] highr_0.6 lattice_0.20-35 Matrix_1.2-13
## [82] psych_1.8.3.3 nloptr_1.0.4 effects_4.0-1
## [85] stringdist_0.9.4.7 pillar_1.2.1 pwr_1.2-2
## [88] lmtest_0.9-36 estimability_1.3 bitops_1.0-6
## [91] raster_2.6-7 httpuv_1.4.5 R6_2.2.2
## [94] promises_1.0.1 KernSmooth_2.23-15 rio_0.5.10
## [97] codetools_0.2-15 gtools_3.5.0 assertthat_0.2.0
## [100] rprojroot_1.3-2 mnormt_1.5-5 multcomp_1.4-8
## [103] parallel_3.4.3 hms_0.4.2 grid_3.4.3
## [106] labelled_1.0.1 coda_0.19-1 glmmTMB_0.2.0
## [109] class_7.3-14 minqa_1.2.4 rmarkdown_1.9
## [112] snakecase_0.9.1 gvlma_1.0.0.2 shiny_1.1.0
## [115] lubridate_1.7.3